https://github.com/QuantScientist/Data-Science-ArrayFire-GPU
In [1]:
%reset -f
import pycuda
from pycuda import compiler
import pycuda.driver as drv
import pycuda.driver as cuda
In [3]:
# Initialise the CUDA driver API, then report every visible device by ordinal.
drv.init()
device_count = drv.Device.count()
print("%d device(s) found." % device_count)
for ordinal in range(device_count):
    print("Device #%d: %s" % (ordinal, drv.Device(ordinal).name()))
# Bare last expression: the notebook shows the driver module's repr as the cell output.
drv
Out[3]:
In [5]:
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule
# CUDA C source for an element-wise product kernel, compiled at run time.
#
# Each thread handles the element at its *global* x index, so the result is the
# same whether the elements are spread over the threads of one block, over
# several one-thread blocks, or any mix of the two. The kernel takes no length
# argument and performs no bounds check: a launch must not supply more
# x-threads in total (gridDim.x * blockDim.x) than the arrays have elements.
srcGPU = """
#include <stdio.h>
__global__ void multGPU(float *dest, float *a, float *b)
{
    // threadIdx.x alone restarts at 0 in every block; add the block offset.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    dest[i] = a[i] * b[i];
    //printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
}
"""
srcGPUModule = SourceModule(srcGPU)
print(srcGPUModule)
In [6]:
# Host-side inputs for the multGPU kernel: constant float32 vectors, so the
# expected product (3 * 2 = 6 in every slot) is easy to check by eye.
ARR_SIZE = 16
a = numpy.full(ARR_SIZE, 3, dtype=numpy.float32)
b = numpy.full(ARR_SIZE, 2, dtype=numpy.float32)
# Output buffer with the same shape and dtype as the inputs.
dest = numpy.zeros_like(a)
In [9]:
# Look up the compiled kernel and run it once over the whole vector.
multGPUFunc = srcGPUModule.get_function("multGPU")
print(multGPUFunc)
# One thread per element in a single block. Extra threads along y would all
# compute the same x index and redundantly rewrite the same output slots.
# drv.In / drv.Out wrap the host arrays so they are copied to the device before
# the launch and back into `dest` afterwards.
multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
            block=(ARR_SIZE, 1, 1))
print(dest)
# Cross-check the GPU result against the same product computed by NumPy.
numpy.testing.assert_allclose(dest, a * b)
In [10]:
# Time the GPU kernel against the plain NumPy product on the same inputs.
import timeit

rounds = 3
# Launch a single block of ARR_SIZE threads, one per element. Threads inside one
# block have distinct threadIdx.x values, so this layout covers every element
# whether the kernel indexes by threadIdx.x alone or by the full global index.
# (ARR_SIZE one-thread blocks would all see threadIdx.x == 0.)
# The measured time includes the host<->device copies done by drv.In / drv.Out
# on every call, not just the kernel execution.
print('pycuda', timeit.timeit(
    lambda: multGPUFunc(drv.Out(dest), drv.In(a), drv.In(b),
                        grid=(1, 1, 1),
                        block=(ARR_SIZE, 1, 1)),
    number=rounds))
# The timed launches must still leave a fully populated, correct result.
numpy.testing.assert_allclose(dest, a * b)
print('npy', timeit.timeit(lambda: a * b, number=rounds))
In [11]:
# Explicit memory management: allocate device memory, copy a host array in,
# double it in place on the GPU, and copy the result back.
# NOTE: this rebinds `a`, which earlier cells use for the 1-D multGPU input.
a = numpy.ones((4, 4), dtype=numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)
mod = SourceModule("""
#include <stdio.h>
__global__ void doublify(float *a)
{
    // Row-major offset of this thread inside its block. The cell launches a
    // single block, so this is also the offset into the array.
    int idx = threadIdx.x + threadIdx.y * blockDim.x;
    a[idx] *= 2;
    // Linear block id within the grid, then linear thread id across the grid.
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y) + idx;
    printf("I am %dth thread in threadIdx.x:%d.threadIdx.y:%d blockIdx.x:%d blockIdx.y:%d blockDim.x:%d blockDim.y:%d\\n", threadId, threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, blockDim.x, blockDim.y);
}
""")
func = mod.get_function("doublify")
# One thread per matrix element: x runs along a row (columns), y along rows,
# which matches the kernel's threadIdx.x + threadIdx.y * blockDim.x indexing.
rows, cols = a.shape
func(a_gpu, block=(cols, rows, 1))
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled)
# Every element should be exactly twice its original value.
numpy.testing.assert_allclose(a_doubled, 2 * a)
[block]
In [ ]:
In [ ]: